In this script I’ll analyze the genetic clustering done with plink.
Calling libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(ggpubr)
## Loading required package: magrittr
Reading data
# Move from the script's directory up to the project root, then into the
# plink clustering results. NOTE(review): relies on the working directory
# being the script's directory when this chunk starts.
setwd('..')
path <- getwd()
setwd(file.path(path, "Results", "GenClustering"))
# plink .eigenval/.eigenvec files are whitespace-delimited with no header;
# sep = "" makes read.csv split on any whitespace.
pca.eigenval <- read.csv("Clus.eigenval", sep = "", header = FALSE)
pca.eigenvec <- read.csv("Clus.eigenvec", sep = "", header = FALSE)
Naming data columns
# Column labels for the 50 principal components reported by plink.
# seq_len(50) replaces the redundant seq(1:50) idiom.
PCnames <- sprintf("PC%d", seq_len(50))
colnames(pca.eigenvec) <- c("FID", "IID", PCnames)
Creating new column to plot hapmap pops against our samples.
# FID holds a HapMap population code for the reference samples and a
# study-specific family ID otherwise; collapse every non-HapMap FID into
# the single label "ADAPT". %in% replaces the eleven chained == tests.
hapmap.pops <- c("ASW", "CEU", "CHB", "CHD", "GIH", "JPT",
                 "LWK", "MEX", "MKK", "TSI", "YRI")
pca.eigenvec$database <- pca.eigenvec$FID
pca.eigenvec$database[!(pca.eigenvec$FID %in% hapmap.pops)] <- "ADAPT"
Because plink outputs normalized PCs, clustering did not achieve very good results. Now we de-normalize the PCA by multiplying the eigenvectors by the corresponding eigenvalues.
pca.eigenvec[,3:52] <- t(c(as.matrix(pca.eigenval)) * t(as.matrix(pca.eigenvec[,3:52])))
I’ll remove samples whose ID starts with the word genome (surely those are not in the face files), the word ver (those are controls from the Axiom arrays), or the word gDNA, as well as those with the name A_103. I’ve seen that some of those are outliers in some PCs.
# Samples to drop: the literal ID "A_103" plus any ID starting with genome
# (not in the face files), ver (Axiom array controls), or gDNA. One anchored
# alternation regex replaces the three separate grepl() calls.
removed <- filter(pca.eigenvec,
                  IID == "A_103" | grepl("^(genome|ver|gDNA)", IID)) %>%
  select(contains("ID"))
removed
## FID IID
## 1 1001 genome_001
## 2 1002 genome_002
## 3 1003 genome_003
## 4 1004 genome_005
## 5 1005 genome_006
## 6 1006 genome_007
## 7 1007 genome_009
## 8 1008 genome_010
## 9 1009 genome_013
## 10 1010 genome_016
## 11 1010 genome_11002_1
## 12 1011 genome_018
## 13 1011 genome_Brian_Schimmoller_Full_20131206130740
## 14 1012 genome_019
## 15 1012 genome_Jennifer_K_Wagner_Full_20130924125453
## 16 1013 genome_020
## 17 1013 genome_JKW
## 18 1014 genome_022
## 19 1014 genome_William_Mattern_Full_20140619082357
## 20 1015 genome_024
## 21 1016 genome_026
## 22 1017 genome_027
## 23 1018 genome_028
## 24 1019 genome_033
## 25 1020 genome_038
## 26 1021 genome_039
## 27 1022 genome_042
## 28 1023 genome_047
## 29 1024 genome_049
## 30 1025 genome_051
## 31 1026 genome_053
## 32 1027 genome_056
## 33 1028 genome_061
## 34 1029 genome_062
## 35 1030 genome_064
## 36 1031 genome_074
## 37 1032 genome_076
## 38 1033 genome_079
## 39 1034 genome_081
## 40 1035 genome_083
## 41 1036 genome_094
## 42 1037 genome_096
## 43 1038 genome_097
## 44 1039 genome_101
## 45 1040 genome_104
## 46 1041 genome_105
## 47 1042 genome_107
## 48 1043 genome_109
## 49 1044 genome_112
## 50 1045 genome_114
## 51 1046 genome_117
## 52 1047 genome_118
## 53 1048 genome_119
## 54 1049 genome_120
## 55 1050 genome_121
## 56 1051 genome_122
## 57 1052 genome_123
## 58 1053 genome_127
## 59 1054 genome_132
## 60 1055 genome_137
## 61 1056 genome_139
## 62 1057 genome_141
## 63 1058 genome_142
## 64 1059 genome_144
## 65 1060 genome_147
## 66 1061 genome_154
## 67 1062 genome_158
## 68 1063 genome_173
## 69 1064 genome_186
## 70 1065 genome_195
## 71 1066 genome_196
## 72 1067 genome_197
## 73 1068 genome_200
## 74 1069 genome_201
## 75 1070 genome_202
## 76 1071 genome_203
## 77 1072 genome_204
## 78 1073 genome_205
## 79 1074 genome_206
## 80 1075 genome_207
## 81 1076 genome_208
## 82 1077 genome_209
## 83 1078 genome_210
## 84 1079 genome_211
## 85 1080 genome_212
## 86 1081 genome_213
## 87 1082 genome_214
## 88 1083 genome_215
## 89 1084 genome_216
## 90 1085 genome_217
## 91 1086 genome_218
## 92 1087 genome_219
## 93 1088 genome_220
## 94 1089 genome_221
## 95 1090 genome_222
## 96 1091 genome_223
## 97 1092 genome_224
## 98 1093 genome_225
## 99 1094 genome_226
## 100 1095 genome_227
## 101 1096 A_103
## 102 1096 genome_229
## 103 1097 genome_230
## 104 1098 genome_232
## 105 1099 genome_233
## 106 10100 genome_234
## 107 10101 genome_235
## 108 10102 genome_236
## 109 10103 genome_237
## 110 10104 genome_238
## 111 10105 genome_239
## 112 10106 genome_240
## 113 10107 genome_241
## 114 10108 genome_242
## 115 10109 genome_243
## 116 10110 genome_244
## 117 10111 genome_245
## 118 10112 genome_246
## 119 10113 genome_247
## 120 10114 genome_248
## 121 10115 genome_249
## 122 10116 genome_250
## 123 10117 genome_938
## 124 10118 A_103
## 125 10119 A_103
## 126 10120 A_103
## 127 10121 A_103
## 128 10122 A_103
## 129 10123 A_103
## 130 10124 A_103
## 131 10125 A_103
## 132 10126 A_103
## 133 AffyAxiom gDNA_103
## 134 AffyAxiom ver10
## 135 AffyAxiom ver11
## 136 AffyAxiom ver12
## 137 AffyAxiom ver13
## 138 AffyAxiom ver14
## 139 AffyAxiom ver15
## 140 AffyAxiom ver16
## 141 AffyAxiom ver17
## 142 AffyAxiom ver18
## 143 AffyAxiom ver19
## 144 AffyAxiom ver20
## 145 AffyAxiom ver21
## 146 AffyAxiom ver22
## 147 AffyAxiom ver23
## 148 AffyAxiom ver24
## 149 AffyAxiom ver25
## 150 AffyAxiom ver26
## 151 AffyAxiom ver27
## 152 AffyAxiom ver28
## 153 AffyAxiom ver29
## 154 AffyAxiom ver30
## 155 AffyAxiom ver31
## 156 AffyAxiom ver32
## 157 AffyAxiom ver33
## 158 AffyAxiom ver34
## 159 AffyAxiom ver35
## 160 AffyAxiom ver37
## 161 AffyAxiom ver39
## 162 AffyAxiom ver41
## 163 AffyAxiom ver42
## 164 AffyAxiom ver43
## 165 AffyAxiom ver44
## 166 AffyAxiom ver45
## 167 AffyAxiom ver46
## 168 AffyAxiom ver47
## 169 AffyAxiom ver48
## 170 AffyAxiom ver49
## 171 AffyAxiom ver50
## 172 AffyAxiom ver51
## 173 AffyAxiom ver52
## 174 AffyAxiom ver53
## 175 AffyAxiom ver54
## 176 AffyAxiom ver55
## 177 AffyAxiom ver56
## 178 AffyAxiom ver57
## 179 AffyAxiom ver58
## 180 AffyAxiom ver59
## 181 AffyAxiom ver60
## 182 AffyAxiom ver61
## 183 AffyAxiom ver62
## 184 AffyAxiom ver63
## 185 AffyAxiom ver64
## 186 AffyAxiom ver65
## 187 AffyAxiom ver66
## 188 AffyAxiom ver67
## 189 AffyAxiom ver68
## 190 AffyAxiom ver69
## 191 AffyAxiom ver70
## 192 AffyAxiom ver71
## 193 AffyAxiom ver72
## 194 AffyAxiom ver73
## 195 AffyAxiom ver74
## 196 AffyAxiom ver75
## 197 AffyAxiom ver76
## 198 AffyAxiom ver77
## 199 AffyAxiom ver78
## 200 AffyAxiom ver79
## 201 AffyAxiom ver80
## 202 AffyAxiom ver81
## 203 AffyAxiom ver82
## 204 AffyAxiom ver83
## 205 AffyAxiom ver84
## 206 rando genome_Amie_Atiyeh_20090419204251
## 207 rando genome_Brett_Patterson_Full_20100313124936
## 208 rando genome_Charles_Voss_Full_20100313013353
## 209 rando genome_Daris_Wells_Full_20100317110224
## 210 rando genome_Emily_Voss_Full_20100313073416
## 211 rando genome_Henry_Louis_Gates_20090402070340
## 212 rando genome_Jeannette_Henney_Full_20100228133658
# Keep everything NOT flagged above (complement of the `removed` filter).
pca.eigenvec <- filter(pca.eigenvec,
                       IID != "A_103" & !grepl("^(genome|ver|gDNA)", IID))
I removed 212 samples.
We also have some outliers in PC5 and PC10. Probably there are also outliers in some other lower PCs.
# Histograms of PC5 and PC10 to eyeball the outliers mentioned above.
hist.pc5 <- ggplot(pca.eigenvec, aes(PC5)) + geom_histogram()
hist.pc10 <- ggplot(pca.eigenvec, aes(PC10)) + geom_histogram()
ggarrange(hist.pc5, hist.pc10)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Inspect the outliers before trimming. No sample actually exceeds 1 on PC5
# (empty result below); PC10 < -0.5 flags three MKK samples.
pca.eigenvec %>% filter(PC5 > 1) %>% select(contains("ID"))
## [1] FID IID
## <0 rows> (or 0-length row.names)
pca.eigenvec %>% filter(PC10 < -0.5) %>% select(contains("ID"))
## FID IID
## 1 MKK NA21399
## 2 MKK NA21402
## 3 MKK NA21405
# Keep only samples inside both cutoffs (drops the three MKK outliers).
pca.eigenvec <- pca.eigenvec %>% filter(PC5 < 1 & PC10 > -0.5)
Some summary stats
summary(pca.eigenvec[,c(1,2,53)])
## FID IID database
## ADAPT :2536 61556 : 2 ADAPT :5133
## AffyAxiom : 920 61572 : 2 MKK : 140
## cv : 673 61634 : 2 YRI : 113
## SanAntonio: 225 62022 : 2 CEU : 112
## MKK : 140 62531 : 2 LWK : 90
## v3 : 116 62577 : 2 GIH : 88
## (Other) :1508 (Other):6106 (Other): 442
PCA screeplot
# Screeplot: one bar per principal component, height = eigenvalue.
eigen.values <- c(as.matrix(pca.eigenval))
barplot(eigen.values,
        names.arg = PCnames,
        col = "steelblue",
        main = "Variances",
        xlab = "Principal Components",
        ylab = "Eigenvalue")
Looking at the HapMap populations:
| POP | Description |
|---|---|
| ASW | African ancestry in Southwest USA |
| CEU | Utah residents with Northern and Western European ancestry from the CEPH collection |
| CHB | Han Chinese in Beijing, China |
| CHD | Chinese in Metropolitan Denver, Colorado |
| GIH | Gujarati Indians in Houston, Texas |
| JPT | Japanese in Tokyo, Japan |
| LWK | Luhya in Webuye, Kenya |
| MEX | Mexican ancestry in Los Angeles, California |
| MKK | Maasai in Kinyawa, Kenya |
| TSI | Toscani in Italia |
| YRI | Yoruba in Ibadan, Nigeria |
plothap <- function(dat, x1, y1)
{
  # Scatter plot of two PCs (column names given as strings), colored by the
  # database column (HapMap pop or ADAPT), jama palette.
  # The .data pronoun replaces the deprecated aes_string().
  p1 <- ggplot(dat, aes(x = .data[[x1]], y = .data[[y1]], color = database)) +
    geom_point(alpha = 0.3, size = 2) + theme_pubr()
  p1 <- ggpar(p1, palette = "jama")
  return(p1)
}
# HapMap reference samples only; compute the filter once instead of six times.
hapmap.samples <- filter(pca.eigenvec, database != "ADAPT")
p1 <- plothap(hapmap.samples, "PC1", "PC2")
p2 <- plothap(hapmap.samples, "PC3", "PC4")
p3 <- plothap(hapmap.samples, "PC5", "PC6")
p4 <- plothap(hapmap.samples, "PC7", "PC8")
p5 <- plothap(hapmap.samples, "PC9", "PC10")
p6 <- plothap(hapmap.samples, "PC11", "PC12")
ggarrange(p1, p2, p3, p4, p5, p6, nrow = 3, ncol = 2,
          common.legend = TRUE, legend = "right",
          align = "v")
# Per-population means of every PC, used to place the HapMap cluster labels.
# summarise_all(mean) replaces the deprecated funs() wrapper.
meanshap <- filter(pca.eigenvec, database != "ADAPT") %>% group_by(database) %>%
  select(contains("PC")) %>% summarise_all(mean)
## Adding missing grouping variables: `database`
plothapmeans <- function(dat, x1, y1){
p1 <- ggplot(dat, aes_string(x = x1, y = y1, label = "database")) +
geom_label(aes(fill = database), colour = "white", fontface = "bold", alpha = 0.6) +
theme_pubr(legend = "none")
p1 <- ggpar(p1, palette = "jama")
return(p1)
}
# HapMap population means on the first six pairs of PCs.
p1 <- plothapmeans(meanshap, "PC1", "PC2")
p2 <- plothapmeans(meanshap, "PC3", "PC4")
p3 <- plothapmeans(meanshap, "PC5", "PC6")
p4 <- plothapmeans(meanshap, "PC7", "PC8")
p5 <- plothapmeans(meanshap, "PC9", "PC10")
p6 <- plothapmeans(meanshap, "PC11", "PC12")
ggarrange(p1, p2, p3, p4, p5, p6,
nrow = 3, ncol = 2,
align = "v")
If we plot our samples together with the HapMap, we see the following
plothapsamp <- function(dat1, dat2, x1, y1)
{
  # HapMap population means (dat1, drawn as labels) overlaid on individual
  # study samples (dat2, drawn as grey points). x1/y1 are PC column names.
  # The .data pronoun replaces the deprecated aes_string().
  p1 <- ggplot(dat1, aes(x = .data[[x1]], y = .data[[y1]], label = database)) +
    geom_point(data = dat2,
               aes(x = .data[[x1]], y = .data[[y1]]), alpha = 0.3) +
    geom_label(aes(fill = database), colour = "white", fontface = "bold", alpha = 0.4) +
    theme_pubr(legend = "none")
  p1 <- ggpar(p1, palette = "jama")
  return(p1)
}
# Study (ADAPT) samples as points under the HapMap population mean labels;
# compute the ADAPT filter once instead of six times.
adapt.samples <- filter(pca.eigenvec, database == "ADAPT")
p1 <- plothapsamp(meanshap, adapt.samples, "PC1", "PC2")
p2 <- plothapsamp(meanshap, adapt.samples, "PC3", "PC4")
p3 <- plothapsamp(meanshap, adapt.samples, "PC5", "PC6")
p4 <- plothapsamp(meanshap, adapt.samples, "PC7", "PC8")
p5 <- plothapsamp(meanshap, adapt.samples, "PC9", "PC10")
p6 <- plothapsamp(meanshap, adapt.samples, "PC11", "PC12")
ggarrange(p1, p2, p3, p4, p5, p6,
          nrow = 3, ncol = 2,
          align = "v")
Looking at our samples with population designation
plothapgroup <- function(dat1, dat2, x1, y1)
{
  # Reference population means (dat1, labels) overlaid on study samples
  # (dat2, points colored by their FID population designation).
  # The .data pronoun replaces the deprecated aes_string().
  p1 <- ggplot(dat1, aes(x = .data[[x1]], y = .data[[y1]], label = database)) +
    geom_point(data = dat2, aes(x = .data[[x1]], y = .data[[y1]], color = FID), alpha = 0.5) +
    geom_label(aes(fill = database), colour = "white", fontface = "bold", alpha = 0.5) +
    theme_pubr()
  p1 <- ggpar(p1, palette = "jama")
  return(p1)
}
# European reference means (CEU, TSI) and study samples with a known
# European population label. %in% replaces the chained == tests.
eurhap <- filter(meanshap, database %in% c("CEU", "TSI"))
eursamp <- filter(pca.eigenvec,
                  FID %in% c("Italian", "Polish", "Irish", "Portuguese"))
# European reference means vs study samples with known European ancestry.
p1 <- plothapgroup(eurhap, eursamp, "PC1", "PC2")
p2 <- plothapgroup(eurhap, eursamp, "PC3", "PC4")
p3 <- plothapgroup(eurhap, eursamp, "PC5", "PC6")
p4 <- plothapgroup(eurhap, eursamp, "PC7", "PC8")
p5 <- plothapgroup(eurhap, eursamp, "PC9", "PC10")
p6 <- plothapgroup(eurhap, eursamp, "PC11", "PC12")
ggarrange(p1, p2, p3, p4, p5, p6,
nrow = 3, ncol = 2,
common.legend = TRUE, legend = "right",
align = "v")
For an initial clustering attempt I’ll group the HapMap populations into continents, as shown in the table:
| POP | Continent |
|---|---|
| ASW | Africa |
| CEU | Europe |
| CHB | Asia |
| CHD | Asia |
| GIH | America |
| JPT | Asia |
| LWK | Africa |
| MEX | America |
| MKK | Africa |
| TSI | Europe |
| YRI | Africa |
# Continent label per HapMap population; ADAPT samples stay NA until
# predicted below. %in% replaces the chained == tests.
# NOTE(review): GIH is grouped with MEX as "America" following the author's
# table, although GIH (Gujarati Indians in Houston) is a South Asian
# population — confirm this grouping is intentional.
pca.eigenvec$continent[pca.eigenvec$database %in% c("CEU", "TSI")] <- "Europe"
pca.eigenvec$continent[pca.eigenvec$database %in% c("ASW", "LWK", "MKK", "YRI")] <- "Africa"
pca.eigenvec$continent[pca.eigenvec$database %in% c("CHB", "CHD", "JPT")] <- "Asia"
pca.eigenvec$continent[pca.eigenvec$database %in% c("GIH", "MEX")] <- "America"
If we look at scatter plots with the continent variable
plotcont <- function(dat, x1, y1)
{
  # PC scatter colored by continent; x1/y1 are PC column-name strings.
  # The .data pronoun replaces the deprecated aes_string().
  p1 <- ggplot(dat, aes(x = .data[[x1]], y = .data[[y1]], color = continent)) +
    geom_point(alpha = 0.3, size = 2) + theme_pubr()
  p1 <- ggpar(p1, palette = "jama")
  return(p1)
}
# Continent view of the HapMap reference samples; filter computed once.
hapmap.cont <- filter(pca.eigenvec, database != "ADAPT")
p1 <- plotcont(hapmap.cont, "PC1", "PC2")
p2 <- plotcont(hapmap.cont, "PC3", "PC4")
p3 <- plotcont(hapmap.cont, "PC5", "PC6")
p4 <- plotcont(hapmap.cont, "PC7", "PC8")
ggarrange(p1, p2, p3, p4, nrow = 2, ncol = 2,
          common.legend = TRUE, legend = "right",
          align = "v")
I’ll use the HapMap samples as training, with a 5-fold cross-validation, and predict the values of our samples.
library(caret)
## Loading required package: lattice
# Train/test split: HapMap samples (known continent) train the classifier;
# ADAPT samples (unknown ancestry) are predicted afterwards.
set.seed(10)
train <- pca.eigenvec %>% filter(database != "ADAPT")
test <- pca.eigenvec %>% filter(database == "ADAPT")
# 5-fold cross-validation, repeated 5 times
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 5)
# Radial-kernel SVM on the first five genetic PCs; caret tunes C over its
# default grid and keeps the most accurate model.
svm.fit <- train(continent ~ PC1 + PC2 + PC3 + PC4 + PC5, method = 'svmRadial',
trControl = fitControl, data = train)
## Loading required package: kernlab
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
svm.fit
## Support Vector Machines with Radial Basis Function Kernel
##
## 985 samples
## 5 predictor
## 4 classes: 'Africa', 'America', 'Asia', 'Europe'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 788, 788, 787, 789, 788, 788, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.9979685 0.9971561
## 0.50 0.9979685 0.9971561
## 1.00 0.9979685 0.9971561
##
## Tuning parameter 'sigma' was held constant at a value of 0.8521666
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.8521666 and C = 0.25.
# Predict continent membership for the study (ADAPT) samples and tabulate.
test$continent <- predict(svm.fit, test)
summary(test$continent)
## Africa America Asia Europe
## 1185 647 215 3086
Scatter plots of the predicted continent groups
# Scatter plots of the study samples colored by predicted continent.
p1 <- plotcont(test, "PC1", "PC2")
p2 <- plotcont(test, "PC3", "PC4")
p3 <- plotcont(test, "PC5", "PC6")
p4 <- plotcont(test, "PC7", "PC8")
ggarrange(p1, p2, p3, p4, nrow = 2, ncol = 2,
common.legend = TRUE, legend = "right",
align = "v")
The next step will be to create a clustering of southern versus northern European populations, using CEU and TSI as training sets. We’ll double-check our clustering with the samples for which we have population information (Irish, Polish, Italian, and Portuguese).
# Label the European training populations: CEU = northern, TSI = southern.
# NOTE(review): "NEruope"/"SEruope" misspell "NEurope"/"SEurope"; the same
# strings are used consistently downstream (and appear in saved output), so
# they are deliberately left unchanged here.
pca.eigenvec$Eur[pca.eigenvec$database == "CEU"] <- "NEruope"
pca.eigenvec$Eur[pca.eigenvec$database == "TSI"] <- "SEruope"
set.seed(10)
train.eur <- pca.eigenvec %>% filter(database == "CEU" | database == "TSI")
# Study samples that were predicted as European in the previous step.
test.eur <- filter(test, continent == "Europe")
# 5-fold cross-validation, repeated 5 times
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 5)
# Radial-kernel SVM separating N/S Europe on the first five genetic PCs.
svm.fit.eur <- train(Eur ~ PC1 + PC2 + PC3 + PC4 + PC5, method = 'svmRadial',
trControl = fitControl, data = train.eur)
svm.fit.eur
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 5 predictor
## 2 classes: 'NEruope', 'SEruope'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 160, 160, 160, 160, 160, 161, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.9160019 0.8304345
## 0.50 0.9170044 0.8323780
## 1.00 0.9129787 0.8240120
##
## Tuning parameter 'sigma' was held constant at a value of 0.2075051
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.2075051 and C = 0.5.
# Predict north/south Europe for the European study samples. The prediction
# is stored in the existing `continent` column, overwriting "Europe" with
# the finer label.
test.eur$continent <- predict(svm.fit.eur, test.eur)
summary(test.eur$continent)
## NEruope SEruope
## 1902 1184
# Scatter plots of the European study samples colored by N/S prediction.
p1 <- plotcont(test.eur, "PC1", "PC2")
p2 <- plotcont(test.eur, "PC3", "PC4")
p3 <- plotcont(test.eur, "PC5", "PC6")
p4 <- plotcont(test.eur, "PC7", "PC8")
ggarrange(p1, p2, p3, p4, nrow = 2, ncol = 2,
common.legend = TRUE, legend = "right",
align = "v")
# Extend the factor levels so the finer European labels can be stored, then
# overwrite the "Europe" rows with the N/S predictions.
# NOTE(review): column 54 is assumed to be `continent` — confirm; indexing
# by name would be safer. The assignment also relies on test.eur preserving
# the row order of the Europe rows in test.
# (The "NEruope"/"SEruope" spellings are kept to match the fitted model.)
levels(test$continent) <- c("Africa", "America", "Asia", "Europe", "NEruope", "SEruope")
test[test$continent == 'Europe', 54] <- test.eur$continent
test$Eur <- NULL
test <- droplevels(test)
# Move to the face-PCA results directory.
# NOTE(review): this relies on knitr resetting the working directory to the
# script's directory at the start of each chunk; '..' is then the project
# root (the printed path below confirms it resolves correctly).
setwd('..')
path <- getwd()
setwd(paste(path, "/Results/PCA", sep = ""))
getwd()
## [1] "C:/Users/tomas/Box Sync/Research/FacialSD/Results/PCA"
# Read the face-shape PCA coefficients and merge with the genetic PCs by
# individual ID.
coeffs <- read.csv("coeffs.csv")
colnames(coeffs)[1] <- "IID"
total <- merge(test, coeffs, by = "IID")
# Split the merged table by column position.
# NOTE(review): the positional ranges 59:145, 3:52 and c(1,2,55:58,54) will
# silently break if any upstream column is added or removed — confirm
# against the merged layout.
FacePCA <- total[,59:145]
GenPCA <- total[,3:52]
Covariates <- total[,c(1,2,55:58,54)]
# Re-label the 87 face PCs and 50 genetic PCs; note PCnames is reused and
# overwritten here.
PCnames <- sprintf("PC%s",seq(1:87))
colnames(FacePCA) <- PCnames
PCnames <- sprintf("PC%s",seq(1:50))
colnames(GenPCA) <- PCnames
# Bundle and save. NOTE(review): the list elements are unnamed; downstream
# code must index by position.
MergedDat <- list(Covariates, FacePCA, GenPCA)
setwd('..')
path <- getwd()
setwd(paste(path, "/Results/MergedData", sep = ""))
save(MergedDat, file = "MergedDat.RData")